欢迎关注“小丫画图”公众号,同名知识星球等你加入
小丫微信: epigenomics E-mail: figureya@126.com
作者:LiYin
小丫编辑校验
visualizing genomic variations, including mutation patterns, copy number variations (CNVs), expression patterns, and methylation patterns.
支持hg18、hg19、mm9、mm10
例如像例文那样展示多个芯片基因的表达变化和之间的link关系
参考资料:https://journals.sagepub.com/doi/10.4137/CIN.S13495
https://www.bioconductor.org/packages/devel/bioc/html/OmicCircos.html
使用国内镜像安装包
options("repos"= c(CRAN="https://mirrors.tuna.tsinghua.edu.cn/CRAN/"))
options(BioC_mirror="http://mirrors.ustc.edu.cn/bioc/")
if (!requireNamespace("BiocManager", quietly = TRUE))
install.packages("BiocManager")
BiocManager::install("OmicCircos", version = "3.8")
加载包
library(S4Vectors)
## Loading required package: stats4
## Loading required package: BiocGenerics
## Loading required package: parallel
##
## Attaching package: 'BiocGenerics'
## The following objects are masked from 'package:parallel':
##
## clusterApply, clusterApplyLB, clusterCall, clusterEvalQ,
## clusterExport, clusterMap, parApply, parCapply, parLapply,
## parLapplyLB, parRapply, parSapply, parSapplyLB
## The following objects are masked from 'package:stats':
##
## IQR, mad, sd, var, xtabs
## The following objects are masked from 'package:base':
##
## anyDuplicated, append, as.data.frame, basename, cbind,
## colMeans, colnames, colSums, dirname, do.call, duplicated,
## eval, evalq, Filter, Find, get, grep, grepl, intersect,
## is.unsorted, lapply, lengths, Map, mapply, match, mget, order,
## paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind,
## Reduce, rowMeans, rownames, rowSums, sapply, setdiff, sort,
## table, tapply, union, unique, unsplit, which, which.max,
## which.min
##
## Attaching package: 'S4Vectors'
## The following object is masked from 'package:base':
##
## expand.grid
library(tidyverse)
## ── Attaching packages ────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.3.0
## ✔ tibble 2.0.1 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## Warning: package 'tibble' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.2
## ── Conflicts ───────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks BiocGenerics::combine()
## ✖ tidyr::expand() masks S4Vectors::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::first() masks S4Vectors::first()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::Position() masks BiocGenerics::Position(), base::Position()
## ✖ dplyr::rename() masks S4Vectors::rename()
library(OmicCircos)
## Loading required package: GenomicRanges
## Loading required package: IRanges
##
## Attaching package: 'IRanges'
## The following objects are masked from 'package:dplyr':
##
## collapse, desc, slice
## The following object is masked from 'package:purrr':
##
## reduce
## Loading required package: GenomeInfoDb
Sys.setenv(LANGUAGE = "en") #显示英文报错信息
options(stringsAsFactors = FALSE) #禁止chr转成factor
如果你的数据已经保存为easy_input.csv格式,就可以跳过这步,进入“开始画图”
gene.location.csv,基因在染色体上的位置。至少包括基因所在的染色体、起始位置、基因 ID(例如ensembl ID、ENTREZ ID、gene symbol)。此处以人为例,对应hg19版本,来自TCGAbiolinks。
signal.csv,图中基因的信号值。第一列是gene ID,跟gene.location.csv里的一种基因ID一致,此处是gene symbol;第二列是图中基因名及其连线的颜色,红色连红色,蓝色连蓝色,你也可以写成其他颜色;第三列之后是三组样品的signal,第一组有两个,第二组有三个,第三组有三个。可以是RNA-seq、ChIP-seq、ATAC-seq、CNV、DNA甲基化等信息。用感兴趣的区段信息替换示例中的基因信息。
gene.location <- read.csv("gene.location.csv")
head(gene.location)
## chromosome_name start_position end_position strand ensembl_gene_id
## 1 13 23551994 23552136 -1 ENSG00000223116
## 2 13 23708313 23708703 1 ENSG00000233440
## 3 13 23726725 23726825 -1 ENSG00000207157
## 4 13 23743974 23744736 -1 ENSG00000229483
## 5 13 23791571 23791673 -1 ENSG00000252952
## 6 13 23817659 23821323 1 ENSG00000235205
## entrezgene external_gene_id
## 1 NA AL157931.1
## 2 NA HMGA1P6
## 3 NA RNY3P4
## 4 NA LINC00362
## 5 100873766 RNU6-58P
## 6 NA TATDN2P3
easy_input <- read.csv('signal.csv')
head(easy_input)
## id color group1_s1 group1_s2 group2_s1 group2_s2 group2_s3
## 1 PPAPDC1A black 1.639 0.214 -0.377 -3.097 1.393
## 2 SHC4 black 1.631 -0.505 -0.651 -0.973 -0.048
## 3 ZNF552 black 0.240 -1.252 -1.879 -0.195 -1.691
## 4 IL12RB2 black -0.141 2.780 2.338 1.308 0.415
## 5 ABAT black -0.981 -0.896 -2.019 1.412 -1.366
## 6 WWP1 black -1.345 -0.620 0.389 0.301 -0.321
## group3_s1 group3_s2 group3_s3
## 1 0.911 1.320 -0.154
## 2 -0.666 0.503 -0.349
## 3 -0.012 -1.133 0.354
## 4 1.321 0.025 2.385
## 5 -1.994 -0.636 -0.885
## 6 -0.900 -1.251 -0.264
# 提取基因在染色体上的位置
gene_list <- easy_input$id
# 此处根据gene symbol提取位置信息
gene_list_chr <- gene.location[gene.location$external_gene_id %in% gene_list,]
# 只保留三列:染色体、起始位置、gene symbol
gene_list_chr <- gene_list_chr[,c(1,2,7)]
colnames(gene_list_chr) <- c('chr','position','id')
head(gene_list_chr)
## chr position id
## 3145 9 2621834 VLDLR
## 4198 21 31586324 CLDN8
## 5908 7 142829170 PIP
## 6715 6 137464968 IL22RA2
## 7146 15 28000021 OCA2
## 8293 17 37366789 STAC2
# 合并染色体位置和signal
gene_chr_fc <- merge(gene_list_chr, easy_input, by='id')
gene_chr_fc <- gene_chr_fc[!duplicated(gene_chr_fc$id),]
write.csv(gene_chr_fc, "easy_input.csv", row.names = F)
gene_chr_fc <- read.csv("easy_input.csv")
head(gene_chr_fc)
## id chr position color group1_s1 group1_s2 group2_s1 group2_s2
## 1 ABAT 16 8768422 black -0.981 -0.896 -2.019 1.412
## 2 ABCC8 11 17414432 black -2.030 3.128 -3.019 2.328
## 3 AFF3 2 100162323 black -1.376 1.755 1.066 1.073
## 4 AGBL2 11 47681143 black -2.572 -0.516 -1.120 2.709
## 5 AGTR1 3 148415571 black 0.160 2.922 -0.966 -2.609
## 6 AKR7A3 1 19609052 black -1.560 1.790 -1.542 1.223
## group2_s3 group3_s1 group3_s2 group3_s3
## 1 -1.366 -1.994 -0.636 -0.885
## 2 2.058 -2.613 0.514 -2.572
## 3 -1.844 -0.392 -0.842 -0.760
## 4 -0.210 -1.800 -0.300 -0.709
## 5 -0.456 -2.007 -0.802 -2.035
## 6 1.252 0.743 2.022 -0.871
# 画三组样品的热图,分3层来
# 若不分3层,热图则会挨在一起,无法分开
exp1 <- gene_chr_fc[,c(2,3,1,5,6)]
head(exp1)
## chr position id group1_s1 group1_s2
## 1 16 8768422 ABAT -0.981 -0.896
## 2 11 17414432 ABCC8 -2.030 3.128
## 3 2 100162323 AFF3 -1.376 1.755
## 4 11 47681143 AGBL2 -2.572 -0.516
## 5 3 148415571 AGTR1 0.160 2.922
## 6 1 19609052 AKR7A3 -1.560 1.790
exp2 <- gene_chr_fc[,c(2,3,1,7,8,9)]
exp3 <- gene_chr_fc[,c(2,3,1,10,11,12)]
处理基因名字为一一对应关系,红色连红色,蓝色连蓝色
gene_label <- gene_chr_fc[, c(2,3,1,4)]
# 按照1-22,x,y排序
# 有的数据没有X或Y,小鼠只有1:20也没关系,不用改
gene_label$chr <- factor(gene_label$chr, levels = c(1:22, "X", "Y"))
# 下面去掉多余的factor。也可以不运行这行,对结果没影响
gene_label$chr <- droplevels(gene_label$chr, exclude = if(anyNA(levels(gene_label$chr))) NULL else NA)
# 排序
gene_label <- gene_label[order(gene_label$chr),]
tail(gene_label)
## chr position id color
## 106 21 35885440 RCAN1 black
## 109 21 43159529 RIPK4 black
## 7 22 39348746 APOBEC3A black
## 127 22 29190543 XBP1 black
## 37 X 13587724 EGFL6 red
## 48 X 13789150 GPM6B red
# 红色基因
gene_link1 <- filter(gene_label, gene_label$color == 'red')
gene_link1 <- gene_link1[, -4]
head(gene_link1)
## chr position id
## 1 3 128628709 KIAA1257
## 2 6 19837617 ID4
## 3 11 72465774 STARD10
## 4 X 13587724 EGFL6
## 5 X 13789150 GPM6B
id11 <- data_frame(id = combn(gene_link1$id, 2)[1, ])
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
head(id11)
## # A tibble: 6 x 1
## id
## <chr>
## 1 KIAA1257
## 2 KIAA1257
## 3 KIAA1257
## 4 KIAA1257
## 5 ID4
## 6 ID4
id1.1 <- data_frame(id1.1 = combn(gene_link1$id, 2)[2, ])
genelink1 <- gene_link1 %>%
right_join(id11)
## Joining, by = "id"
genelink11 <- gene_link1 %>%
rename(id1.1=id) %>%
right_join(id1.1)
## Joining, by = "id1.1"
gene_link1 <- cbind(genelink1, genelink11)
# 蓝色基因
gene_link2 <- filter(gene_label, gene_label$color == 'blue')
gene_link2 <- gene_link2[, -4]
id22 <- data_frame(id=combn(gene_link2$id, 2)[1, ])
id2.1<- data_frame(id2.1=combn(gene_link2$id, 2)[2, ])
genelink2 <- gene_link2 %>%
right_join(id22)
## Joining, by = "id"
genelink22 <- gene_link2 %>%
rename(id2.1=id) %>%
right_join(id2.1)
## Joining, by = "id2.1"
gene_link2 <- cbind(genelink2, genelink22)
R代表半径位置,W是宽度,cir是染色体
图中的三组样品的文字需要用AI进行标记
包里自带的写染色体名字的位置不合理,也没有参数可以调整,这里提供两种方法来处理:
pdf('OmicCircos.pdf',width = 8,height = 8)
par(mar=c(0, 0, 0, 0)); #准备画板
plot(c(1,800), c(1,800), type="n", axes=FALSE, xlab="", ylab="", main="");
#画染色体scale
circos(R=250, cir="hg19", W=5, type="chr",
print.chr.lab=FALSE, #自带的名字会跟scale重叠
scale=TRUE)
#画染色体名字
#染色体长度信息,从https://genome.ucsc.edu/goldenPath/help/hg19.chrom.sizes下载,然后整理而成
chr_info <- read.table("hg19.chrom.sizes.txt")
head(chr_info)
## V1 V2
## 1 1 249250621
## 2 2 243199373
## 3 3 198022430
## 4 4 191154276
## 5 5 180915260
## 6 6 171115067
chr_label <- data.frame(chr = chr_info$V1, position = (chr_info$V2 + 1)/2, id = chr_info$V1)
circos(R=270, cir="hg19", W=2, mapping=chr_label, type="label2", col = "black", side="in", cex=0.5);
#画基因名字,side="in"则基因在圈里面
circos(R=280, cir="hg19", W=10, mapping=gene_label, type="label", lwd = 0.4, col = gene_label$color,
side="out", cex=0.5);
#画第1个热图
circos(R=210, cir="hg19", W=40, mapping=exp1, type="heatmap2",
cluster=FALSE, col.bar=FALSE, col=NULL,scale = TRUE)
## Warning in circos(R = 210, cir = "hg19", W = 40, mapping = exp1, type =
## "heatmap2", : NAs introduced by coercion
#画第2个热图
circos(R=160, cir="hg19", W=60, mapping=exp2, type="heatmap2",
cluster=FALSE, col.bar=FALSE, col=NULL,scale = TRUE)
## Warning in circos(R = 160, cir = "hg19", W = 60, mapping = exp2, type =
## "heatmap2", : NAs introduced by coercion
#画第3个热图
circos(R=110, cir="hg19", W=60, mapping=exp3, type="heatmap2",
cluster=FALSE, col.bar=FALSE, col=NULL,scale = TRUE)
## Warning in circos(R = 110, cir = "hg19", W = 60, mapping = exp3, type =
## "heatmap2", : NAs introduced by coercion
#画link
circos(R=100, cir="hg19", W=10, mapping=gene_link2, type="link2", lwd=2, col='blue')
circos(R=100, cir="hg19", W=10, mapping=gene_link1, type="link2", lwd=2, col='red')
dev.off()
## quartz_off_screen
## 2
修改了染色体名字的位置,顺便修改了scale的位置。
修改后的函数命名为circos_plus,保存为sourceOmiccircos.R,位于当前文件夹。查找#Ya#可看到修改的代码。
source("sourceOmiccircos.R")
pdf('OmicCircos_plus.pdf',width = 8,height = 8)
par(mar=c(0, 0, 0, 0)); #准备画板
plot(c(1,800), c(1,800), type="n", axes=FALSE, xlab="", ylab="", main="");
#画染色体
circos_plus(R=250, cir="hg19", W=5, type="chr",
print.chr.lab=TRUE, scale=TRUE)
#画基因名字,side="in"则基因在圈里面
circos_plus(R=280, cir="hg19", W=10, mapping=gene_label, type="label", lwd = 0.4, col = gene_label$color,
side="out", cex=0.5);
#画第1个热图
circos_plus(R=210, cir="hg19", W=40, mapping=exp1, type="heatmap2",
cluster=FALSE, col.bar=FALSE, col=NULL,scale = TRUE)
## Warning in circos_plus(R = 210, cir = "hg19", W = 40, mapping = exp1, type
## = "heatmap2", : NAs introduced by coercion
#画第2个热图
circos_plus(R=160, cir="hg19", W=60, mapping=exp2, type="heatmap2",
cluster=FALSE, col.bar=FALSE, col=NULL,scale = TRUE)
## Warning in circos_plus(R = 160, cir = "hg19", W = 60, mapping = exp2, type
## = "heatmap2", : NAs introduced by coercion
#画第3个热图
circos_plus(R=110, cir="hg19", W=60, mapping=exp3, type="heatmap2",
cluster=FALSE, col.bar=FALSE, col=NULL,scale = TRUE)
## Warning in circos_plus(R = 110, cir = "hg19", W = 60, mapping = exp3, type
## = "heatmap2", : NAs introduced by coercion
#画link,col跟基因颜色一致
circos_plus(R=100, cir="hg19", W=10, mapping=gene_link2, type="link2", lwd=2, col='blue')
circos_plus(R=100, cir="hg19", W=10, mapping=gene_link1, type="link2", lwd=2, col='red')
dev.off()
## quartz_off_screen
## 2
sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS 10.14.2
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] parallel stats4 stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] bindrcpp_0.2.2 OmicCircos_1.20.0 GenomicRanges_1.34.0
## [4] GenomeInfoDb_1.18.1 IRanges_2.16.0 forcats_0.3.0
## [7] stringr_1.3.1 dplyr_0.7.8 purrr_0.3.0
## [10] readr_1.3.1 tidyr_0.8.2 tibble_2.0.1
## [13] ggplot2_3.1.0 tidyverse_1.2.1 S4Vectors_0.20.1
## [16] BiocGenerics_0.28.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_0.2.5 xfun_0.4 haven_2.0.0
## [4] lattice_0.20-38 colorspace_1.4-0 generics_0.0.2
## [7] htmltools_0.3.6 yaml_2.2.0 utf8_1.1.4
## [10] rlang_0.3.1 pillar_1.3.1 glue_1.3.0
## [13] withr_2.1.2 modelr_0.1.3 readxl_1.2.0
## [16] GenomeInfoDbData_1.2.0 bindr_0.1.1 plyr_1.8.4
## [19] zlibbioc_1.28.0 munsell_0.5.0 gtable_0.2.0
## [22] cellranger_1.1.0 rvest_0.3.2 evaluate_0.12
## [25] knitr_1.21 fansi_0.4.0 broom_0.5.1
## [28] Rcpp_1.0.0 scales_1.0.0 backports_1.1.3
## [31] XVector_0.22.0 jsonlite_1.6 hms_0.4.2
## [34] digest_0.6.18 stringi_1.2.4 grid_3.5.1
## [37] bitops_1.0-6 cli_1.0.1 tools_3.5.1
## [40] magrittr_1.5 RCurl_1.95-4.11 lazyeval_0.2.1
## [43] crayon_1.3.4 pkgconfig_2.0.2 xml2_1.2.0
## [46] lubridate_1.7.4 assertthat_0.2.0 rmarkdown_1.11
## [49] httr_1.4.0 rstudioapi_0.9.0 R6_2.3.0
## [52] nlme_3.1-137 compiler_3.5.1